# importing the dataset
import numpy as np
import pandas as pd

df = pd.read_stata('PS06-Glewwe2016.dta')
df.head()


# dropping columns not needed for our project
df.drop(['countycode','towncode','schoolcode','idcode'], axis=1, inplace=True)
df.head()


df['age'] = (df['examdate'] - df['birthdate']) / 365
df.head()


df['acuity'] = (df['lefteye'] + df['righteye']) / 2


df.headeduc = df.headeduc.replace({1:16, 2:14, 3:12, 4:12, 5:12, 6:9, 7:6, 8:0})
df.head()


# we replace numeric values with the professions and convert those categorical variables into the dummy variables
df.headocc = df.headocc.replace({1:"farmer", 2:'worker', 3:'teacher', 4:'leader', 5:'other', 7:'other', 8:'other'})
prof = pd.get_dummies(pd.Series(list(df['headocc'])), drop_first=True)

# we'll set 'other' as a base 
prof.drop(['other'], axis=1, inplace=True)

# we add these values to our dataframe
df = pd.concat([df, prof], axis=1)
df.drop(['headocc'], axis=1, inplace=True)
df.head()


# one if the person had glasses before project, zero otherwise
df.glasses = df.glasses.replace({1:1, 2:0})


# 1. we generate the average test score
avetest = (df['chinese04s2'] + df['math04s2'] + df['science04s2']) / 3

# 2. create the mean and standard deviation variables
testmean = avetest.mean()
sdtest = avetest.std()

# 3. generate the z-score of the test
df['ztest'] = (avetest - testmean) / sdtest
df.head()


import statsmodels.formula.api as smf

reg0 = smf.ols('received ~ acuity + female + glasses + farmer + leader + teacher + worker + headeduc + ztest + townincpc',
               data=df).fit()
reg0.summary()


# we'll generate the interaction of eye acuity and female to compare the effect of eye acuity for females and males
reg1 = smf.ols('received ~ acuity + female + glasses + farmer + leader + teacher + worker + headeduc + ztest + townincpc + female*acuity',
               data=df).fit()
reg1.summary()


df[['acuity','female','ztest','glasses','teacher','leader']].describe()


# we replace NaN (empty) values with average values of that column
df.fillna({'acuity':4.59,'female':0.49,'ztest':0.000008,'glasses':0.04,'teacher':0.012,'leader':0.00096}, inplace=True)

# for the dependent variable we just replace NaN with 0
df['received'].fillna(0, inplace=True)


%%capture
#! pip install seaborn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics


# we plot the graph with a fitted line for acuity and acceptance probability
ax = sns.regplot(x="acuity", y="received", data=df, line_kws={"color":"cyan","alpha":0.3,"lw":4})
plt.ylabel('Probability', fontsize=12)
plt.ylim((-0.5,1.5))
plt.xlabel('Eye Acuity', fontsize=12)
ax.set_title('Linear Model',fontsize=16)

Text(0.5, 1.0, 'Linear Model')


from sklearn.linear_model import LinearRegression
df['acfe'] = df['acuity'] * df['female']
Xf = df[['acuity','female','acfe']]
linmodel = LinearRegression()
linmodel.fit(Xf,df[['received']])
print('coeficients of the model with interaction of female and eye acuity is:', linmodel.coef_)
print('constant term is:', linmodel.intercept_)

coeficients of the model with interaction of female and eye acuity is: [[-0.6829079   0.1353346  -0.03819171]]
constant term is: [3.8328638]


import seaborn as sns
ax = sns.regplot(x=df[['acuity']], y=df[['received']], data=df, logistic=True, line_kws={"color":"cyan","alpha":0.3,"lw":4})
ax.set_ylabel('Probability')
ax.set_xlabel('Eye Acuity')
ax.set_xlim(3.6,5.3)
ax.set_title('Logistic Model',fontsize=16)

Text(0.5, 1.0, 'Logistic Model')


# we chose the set of explanatory variables close to what we used for our linear regression
Xset = df[['acuity','female','ztest','glasses','teacher','leader']]


# we run the Logistic regression
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(Xset,df[['received']])
predictions = logmodel.predict(Xset)

print('explanatory variables coeficients are, as follows: ', logmodel.coef_)
print('intercept or constant of the model is:',logmodel.intercept_)

explanatory variables coeficients are, as follows:  [[-2.35578439 -0.15953785 -0.10125446  0.64987177 -0.33930451 -0.45937253]]
intercept or constant of the model is: [11.67654457]

F:\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
F:\Anaconda3\lib\site-packages\sklearn\utils\validation.py:724: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)


#this function will generate confusion matrix needed to evaluate success rate of our logistic model
from sklearn.metrics import classification_report, confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="red" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
print(confusion_matrix(df[['received']], predictions, labels=[1,0]))

[[2018  100]
 [ 752  264]]


# Compute confusion matrix
cnf_matrix = confusion_matrix(df[['received']], predictions, labels=[1,0])
np.set_printoptions(precision=2)


# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Received=1','Received=0'],normalize= False,  title='Confusion matrix')

Confusion matrix, without normalization
[[2018  100]
 [ 752  264]]


from sklearn.metrics import classification_report
print(classification_report(df[['received']], predictions))

              precision    recall  f1-score   support

           0       0.73      0.26      0.38      1016
           1       0.73      0.95      0.83      2118

    accuracy                           0.73      3134
   macro avg       0.73      0.61      0.60      3134
weighted avg       0.73      0.73      0.68      3134

Variable	Definition
countycode towncode schoolcode idcode	Codes for county, township, school, and student
grade	Student’s grade (1 to 5)
female	=1 for female, =0 for male
birthdate; examdate	Student’s birth date and exam date
lefteye; righteye	Student’s eye exam results for left and right eye. A measure between 4 and 5.2, with 4 unable to read any line (out of 12 lines) in the board and 5.2 being able to read all the lines
height; weight	Student’s height (centimeters) and weight (kilograms)
headeduc; headocc	Household head’s education (codes 1-8) and occupation (1-8); Education (=years): 1=16y; 2=14; 3,4,5=12y; 6=9y; 7=6y; 8=0y; Occupation: 1=farmer; 2=worker; 3=teacher; 6=village leader, 4,5,7,8=others.
glasses	=1 if the student had glasses before project started 2= if not
received	=1 if the student received glasses in project 0= otherwise
eligible	=1 if the student was considered as eligible to receive glasses =0 otherwise
offer	=1 if the student was offered glasses in the project =0 otherwise
chinese04s2 math04s2 science04s2	Test scores in Chinese, math, and science in 2004 (before the project started)
townincpc	Township income per capita

	countycode	grade	towncode	schoolcode	idcode	female	birthdate	examdate	lefteye	righteye	...	received	headeduc	headocc	glasses	chinese04s2	math04s2	science04s2	eligible	offer	townincpc
0	1.0	3.0	1.0	1.0	1.0	0	1994-05-01	2004-06-01	4.8	4.7	...	0	6.0	1	2	83.0	81.0	76.0	1.0	1.0	2200.0
1	1.0	3.0	1.0	1.0	5.0	0	1994-09-01	2004-06-01	4.8	4.7	...	1	5.0	5	2	85.0	78.0	75.0	1.0	1.0	2200.0
2	1.0	3.0	1.0	1.0	13.0	1	1994-01-01	2004-05-31	4.7	4.6	...	1	6.0	1	2	70.0	73.0	70.0	1.0	1.0	2200.0
3	1.0	3.0	1.0	1.0	25.0	0	1994-02-01	2004-06-01	4.4	4.3	...	1	6.0	1	2	86.0	91.0	83.0	1.0	1.0	2200.0
4	1.0	3.0	1.0	1.0	29.0	0	1994-02-01	2004-06-01	4.7	4.7	...	1	6.0	1	2	87.0	98.0	89.0	1.0	1.0	2200.0

	grade	female	birthdate	examdate	lefteye	righteye	height	weight	received	headeduc	headocc	glasses	chinese04s2	math04s2	science04s2	eligible	offer	townincpc
0	3.0	0	1994-05-01	2004-06-01	4.8	4.7	136.0	24.0	0	6.0	1	2	83.0	81.0	76.0	1.0	1.0	2200.0
1	3.0	0	1994-09-01	2004-06-01	4.8	4.7	133.0	26.0	1	5.0	5	2	85.0	78.0	75.0	1.0	1.0	2200.0
2	3.0	1	1994-01-01	2004-05-31	4.7	4.6	135.0	28.0	1	6.0	1	2	70.0	73.0	70.0	1.0	1.0	2200.0
3	3.0	0	1994-02-01	2004-06-01	4.4	4.3	145.0	34.5	1	6.0	1	2	86.0	91.0	83.0	1.0	1.0	2200.0
4	3.0	0	1994-02-01	2004-06-01	4.7	4.7	148.0	33.0	1	6.0	1	2	87.0	98.0	89.0	1.0	1.0	2200.0

	grade	female	birthdate	examdate	lefteye	righteye	height	weight	received	headeduc	headocc	glasses	chinese04s2	math04s2	science04s2	eligible	offer	townincpc	age
0	3.0	0	1994-05-01	2004-06-01	4.8	4.7	136.0	24.0	0	6.0	1	2	83.0	81.0	76.0	1.0	1.0	2200.0	10 days 02:14:08.219178
1	3.0	0	1994-09-01	2004-06-01	4.8	4.7	133.0	26.0	1	5.0	5	2	85.0	78.0	75.0	1.0	1.0	2200.0	9 days 18:08:52.602739
2	3.0	1	1994-01-01	2004-05-31	4.7	4.6	135.0	28.0	1	6.0	1	2	70.0	73.0	70.0	1.0	1.0	2200.0	10 days 10:03:36.986301
3	3.0	0	1994-02-01	2004-06-01	4.4	4.3	145.0	34.5	1	6.0	1	2	86.0	91.0	83.0	1.0	1.0	2200.0	10 days 08:05:15.616438
4	3.0	0	1994-02-01	2004-06-01	4.7	4.7	148.0	33.0	1	6.0	1	2	87.0	98.0	89.0	1.0	1.0	2200.0	10 days 08:05:15.616438

	grade	female	birthdate	examdate	lefteye	righteye	height	weight	received	headeduc	headocc	glasses	chinese04s2	math04s2	science04s2	eligible	offer	townincpc	age	acuity
0	3.0	0	1994-05-01	2004-06-01	4.8	4.7	136.0	24.0	0	9.0	1	2	83.0	81.0	76.0	1.0	1.0	2200.0	10 days 02:14:08.219178	4.75
1	3.0	0	1994-09-01	2004-06-01	4.8	4.7	133.0	26.0	1	12.0	5	2	85.0	78.0	75.0	1.0	1.0	2200.0	9 days 18:08:52.602739	4.75
2	3.0	1	1994-01-01	2004-05-31	4.7	4.6	135.0	28.0	1	9.0	1	2	70.0	73.0	70.0	1.0	1.0	2200.0	10 days 10:03:36.986301	4.65
3	3.0	0	1994-02-01	2004-06-01	4.4	4.3	145.0	34.5	1	9.0	1	2	86.0	91.0	83.0	1.0	1.0	2200.0	10 days 08:05:15.616438	4.35
4	3.0	0	1994-02-01	2004-06-01	4.7	4.7	148.0	33.0	1	9.0	1	2	87.0	98.0	89.0	1.0	1.0	2200.0	10 days 08:05:15.616438	4.70

Problem Solving 03¶

Replication of a paper published by Paul Glewwe, Albert Park, Meng Zhao - "A better vision for development: Eyeglasses and academic performance in rural primary schools in China"¶

Original paper, published by Journal of Development Economics in 2016, you can download from here and dataset can be downloaded from here ¶

In this assignment, we'll work on a paper that analyzes the educational effect of providing eyeglasses to students in China in an experimental setting. We discuss the second part of the paper that analyzes students’ choices over accepting eyeglasses (Part 8 and table 8).¶

Part 1:¶

Part 2:¶

We'll analyze the effect of eye acuity on the probability of accepting the offer for male versus female students.¶

Part 3:¶

We'll try to visualize the model based on logistic model.¶

Now we make the input of other parameters into our model; close to what we used for a linear regression¶

Next, we evaluate the accuracy of our predictions by generating a classification report¶

Dep. Variable:	received	R-squared:	0.183
Model:	OLS	Adj. R-squared:	0.180
Method:	Least Squares	F-statistic:	68.68
Date:	Fri, 27 Dec 2019	Prob (F-statistic):	6.65e-127
Time:	11:40:11	Log-Likelihood:	-1723.6
No. Observations:	3085	AIC:	3469.
Df Residuals:	3074	BIC:	3535.
Df Model:	10
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	3.7582	0.139	27.080	0.000	3.486	4.030
acuity	-0.6945	0.028	-24.922	0.000	-0.749	-0.640
female	-0.0387	0.015	-2.535	0.011	-0.069	-0.009
glasses	0.1108	0.039	2.863	0.004	0.035	0.187
farmer	0.1124	0.031	3.621	0.000	0.052	0.173
leader	-0.1537	0.247	-0.623	0.533	-0.637	0.330
teacher	0.0204	0.076	0.269	0.788	-0.128	0.169
worker	-0.0362	0.060	-0.600	0.548	-0.154	0.082
headeduc	-0.0003	0.003	-0.111	0.912	-0.007	0.006
ztest	-0.0199	0.008	-2.537	0.011	-0.035	-0.005
townincpc	1.665e-05	1.55e-05	1.072	0.284	-1.38e-05	4.71e-05

Omnibus:	710.500	Durbin-Watson:	1.234
Prob(Omnibus):	0.000	Jarque-Bera (JB):	260.956
Skew:	-0.523	Prob(JB):	2.16e-57
Kurtosis:	2.032	Cond. No.	5.28e+04

Omnibus:	700.308	Durbin-Watson:	1.234
Prob(Omnibus):	0.000	Jarque-Bera (JB):	260.393
Skew:	-0.523	Prob(JB):	2.86e-57
Kurtosis:	2.036	Cond. No.	6.40e+04

	acuity	female	ztest	glasses	teacher	leader
count	3126.000000	3134.000000	3094.000000	3134.000000	3134.000000	3134.000000
mean	4.594357	0.485322	0.000008	0.041161	0.012444	0.000957
std	0.274305	0.499864	1.000002	0.198695	0.110875	0.030929
min	3.600000	0.000000	-7.133553	0.000000	0.000000	0.000000
25%	4.400000	0.000000	-0.550634	0.000000	0.000000	0.000000
50%	4.650000	0.000000	0.065312	0.000000	0.000000	0.000000
75%	4.800000	1.000000	0.681258	0.000000	0.000000	0.000000
max	5.200000	1.000000	2.182625	1.000000	1.000000	1.000000

Problem Solving 03¶

Replication of a paper published by Paul Glewwe, Albert Park, Meng Zhao - "A better vision for development: Eyeglasses and academic performance in rural primary schools in China"¶

Original paper, published by Journal of Development Economics in 2016, you can download from here and dataset can be downloaded from here¶

In this assignment, we'll work on a paper that analyzes the educational effect of providing eyeglasses to students in China in an experimental setting. We discuss the second part of the paper that analyzes students’ choices over accepting eyeglasses (Part 8 and table 8).¶

Part 1:¶

Part 2:¶

We'll analyze the effect of eye acuity on the probability of accepting the offer for male versus female students.¶

Part 3:¶

We'll try to visualize the model based on logistic model.¶

Now we make the input of other parameters into our model; close to what we used for a linear regression¶

Next, we evaluate the accuracy of our predictions by generating a classification report¶

Original paper, published by Journal of Development Economics in 2016, you can download from here and dataset can be downloaded from here ¶